In [1]:
# Import the libraries used in this notebook
import pandas as pd
import numpy as np
import country_converter as coco
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from wordcloud import WordCloud
import warnings
warnings.filterwarnings('ignore')
import nltk

%matplotlib inline
In [2]:
# Load the 2023 dataset.
# The 'salary' and 'salary_currency' columns are dropped by name right after
# reading, so re-running this cell always rebuilds df from the CSV.
df = pd.read_csv('../datacsv/ds_salaries.csv').drop(columns = ['salary', 'salary_currency'])
In [3]:
# Sanity-check the load: print (rows, columns), then preview the first five rows.
print(df.shape)
df.head(n = 5)
(3755, 9)
Out[3]:
work_year experience_level employment_type job_title salary_in_usd employee_residence remote_ratio company_location company_size
0 2023 SE FT Principal Data Scientist 85847 ES 100 ES L
1 2023 MI CT ML Engineer 30000 US 100 US S
2 2023 MI CT ML Engineer 25500 US 100 US S
3 2023 SE FT Data Scientist 175000 CA 100 CA M
4 2023 SE FT Data Scientist 120000 CA 100 CA M
In [4]:
# Number of missing values in each column.
df.isna().sum()
Out[4]:
work_year             0
experience_level      0
employment_type       0
job_title             0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64
In [5]:
# Expand the two-letter experience codes into readable labels in one pass.
experience_labels = {
    'EN': 'Entry-level/Junior',
    'MI': 'Mid-level/Intermediate',
    'SE': 'Senior-level/Expert',
    'EX': 'Executive-level/Director',
}
df['experience_level'] = df['experience_level'].replace(experience_labels)

# Treemap whose tiles are sized by the number of rows per experience level.
ex_level = df['experience_level'].value_counts()
fig = px.treemap(
    ex_level,
    path = [ex_level.index],
    values = ex_level.values,
    title = 'Experience Level',
)
fig.show()
In [6]:
# How many distinct job titles appear in the dataset.
distinct_job_titles = set(df['job_title'])
print('Different job designations altogether :', len(distinct_job_titles))
Different job designations altogether : 93
In [7]:
# value_counts() is already sorted by descending count, so the first 15 rows
# are the 15 most frequent job titles.
top15_job_titles = df['job_title'].value_counts().head(15)
fig = px.bar(
    x = top15_job_titles.index,
    y = top15_job_titles.values,
    text = top15_job_titles.values,
    title = 'Top 15 Job Designations',
)
fig.update_layout(xaxis_title = "Job Designations", yaxis_title = "Count")
fig.show()
In [8]:
def Freq_df(word_list):
    """Tabulate how often each term occurs in ``word_list``.

    Parameters
    ----------
    word_list : iterable of hashable
        Terms to count; every element is treated as one term.

    Returns
    -------
    pandas.DataFrame
        Columns ``Term`` and ``Frequency``, one row per distinct term, sorted
        by ``Frequency`` in descending order and re-indexed from 0. An empty
        ``word_list`` yields an empty frame that still has both columns.
    """
    # Imported locally so this cell only needs pandas plus the standard
    # library to run.
    from collections import Counter

    term_counts = Counter(word_list)
    # Name the column at construction time: assigning df.columns afterwards
    # raises a length-mismatch ValueError when there are no terms at all.
    df_freq = pd.DataFrame.from_dict(term_counts, orient = 'index',
                                     columns = ['Frequency'])
    df_freq.index.name = 'Term'
    df_freq = df_freq.sort_values(by = ['Frequency'], ascending = False)
    # Move the terms out of the index into a regular 'Term' column.
    df_freq = df_freq.reset_index()
    return df_freq
In [9]:
def Word_Cloud(data, title):
    """Render a word cloud from term frequencies and show it.

    ``data`` is passed to ``WordCloud.generate_from_frequencies`` and
    ``title`` becomes the plot title. The figure is displayed immediately;
    nothing is returned.
    """
    cloud_options = {
        'width': 1200,
        'height': 600,
        'max_words': 50,
        'background_color': 'white',
        'max_font_size': 100,
        'random_state': 42,  # fixed seed passed to WordCloud
    }

    plt.figure(figsize = (20, 15))
    cloud = WordCloud(**cloud_options)
    cloud.generate_from_frequencies(data)

    # Draw on the current axes of the figure created above.
    axes = plt.gca()
    axes.imshow(cloud)
    axes.set_title(title)
    axes.axis('off')
    plt.show()
In [10]:
# Count how often each job title occurs, then turn the table into the
# {term: frequency} mapping that Word_Cloud passes to the word-cloud generator.
freq_df = Freq_df(df['job_title'].values.tolist())
data = freq_df.set_index('Term')['Frequency'].to_dict()

Word_Cloud(data , 'WordCloud of job designations')
In [ ]: